1. ggplot2 points

library(dplyr)
library(ggplot2)

# 1. max. vol and price
diamonds %>% 
  mutate(theoretical_max_vol = x * y * z) %>% 
  ggplot(aes(x = theoretical_max_vol, y = price)) + geom_point()

## we have an outlier, removing it 
diamonds %>% 
  mutate(theoretical_max_vol = x * y * z) %>% 
  filter(theoretical_max_vol < 3000) %>% 
  ggplot(aes(x = theoretical_max_vol, y = price)) + geom_point()

# 2. aesthetics options
# taking only a smaller subset
set.seed(123)
diamond_df <- diamonds %>% sample_n(1000)

ggplot(diamond_df, aes(x = carat, y = price)) + geom_point()

ggplot(diamond_df, aes(x = carat, y = price, color = cut)) + geom_point()

ggplot(diamond_df, aes(x = carat, y = price, color = color)) + geom_point()

ggplot(diamond_df, aes(x = carat, y = price, color = color, shape = cut)) + geom_point()

ggplot(diamond_df, aes(x = carat, y = price)) + geom_point(alpha = 0.5)

# 3. assign plot to a variable
plot1 <- ggplot(diamond_df, aes(x = carat, y = price)) + geom_point()
plot1 + ggtitle("This is a title")

ggsave("plot1.pdf", plot1, width = 7, height = 5)
ggsave("plot1.png", plot1, width = 7, height = 5)

Explain the differences:

# 1. data and aesthetics are set globally (at least for this plot)
ggplot(df, aes(x = time, y = value, color = symbol)) + 
  geom_line()

# 2. data is set globally (this plot), aesthetics are set only for geom_line()
ggplot(df) + 
  geom_line(aes(x = time, y = value, color = symbol))

# 3. data and aesthetics are set only for geom_line()
ggplot() + 
  geom_line(data = df, aes(x = time, y = value, color = symbol))

# 7: Yes, the code would work, data, x, and color are set globally (for the plot), y is set for geom_line() only
ggplot(df, aes(x = time, color = symbol)) + 
  geom_line(aes(y = value))

2. Geometrics

2D Random Walk

# 1. create some data
set.seed(123)
rwalk <- tibble(
  id = 1:10000,
  x = cumsum(rnorm(10000)),
  y = cumsum(rnorm(10000))
)

ggplot(rwalk, aes(x = x, y = y, color = id)) + geom_path()

stock returns

library(tidyquant)

dow_constituents <- tq_index("dow-jones")
dow_stocks <- tq_get(dow_constituents$symbol, from = "2010-01-01", to = "2015-12-31")

dow_df <- left_join(dow_stocks, dow_constituents, by = "symbol")
# plot the lines individually
ggplot(dow_df, aes(x = date, y = adjusted, color = company)) + 
  geom_line() + 
  theme(legend.position = "none") # remove legend...

# indexed plot
dow_df %>% 
  group_by(company) %>% 
  mutate(idx_price = adjusted / adjusted[1] * 100) %>% 
ggplot(aes(x = date, y = idx_price, color = company)) + 
  geom_line() + 
  theme(legend.position = "none") # remove legend...

# density plot of returns
dow_df %>% 
  group_by(company) %>% 
  mutate(returns = adjusted / lag(adjusted) - 1) %>% 
  ggplot(aes(x = returns, fill = company)) + 
  geom_density(alpha = 0.1) + 
  scale_x_continuous(limits = c(-0.1, 0.1)) + # limit the display to -0.1 til 0.1
  theme(legend.position = "none") # remove legend...

finance market structure

goog_df <- tq_get("GOOG", from = "2016-01-01", to = "2016-12-31")

ggplot(goog_df, aes(x = date, y = adjusted)) + 
  geom_ribbon(aes(ymin = low, ymax = high), fill = "orange") +
  geom_line()

ggplot(goog_df, aes(x = date, y = adjusted)) + 
  geom_ribbon(aes(ymin = low, ymax = high), fill = "lightgreen") +
  geom_step()

World Bank

library(wbstats)

wb_df <- wb(indicator = "SP.POP.TOTL", startdate = 2000, enddate = 2016)

wb_selected_df <- wb_df %>% 
  filter(country %in% c("Germany", "European Union", "France", "United States")) %>%
  mutate(date = as.numeric(date))

ggplot(wb_selected_df, aes(x = date, y = value, color = country)) + geom_line()

wb_selected_df %>% 
  group_by(country) %>% 
  arrange(date) %>% 
  mutate(diff = value / lag(value) - 1) %>% 
  ggplot(aes(x = date, y = diff, color = country)) + geom_boxplot()
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

Advanced Gapminder

library(gapminder)

gapminder %>% 
  filter(year == 2007) %>% 
  ggplot(aes(x = gdpPercap, y = lifeExp, color = continent, size = pop)) + 
  geom_point() +
  scale_x_log10()

3. Functions

# 1. trigonometry
fun1 <- function(x) sin(x + 1) + 3
fun2 <- function(x) cos(-x)* 3

ggplot(tibble(x = -10:10), aes(x = x)) + 
  stat_function(fun = fun1, n = 1000, color = "blue") + 
  stat_function(fun = fun2, n = 1000, color = "red") + 
  scale_y_continuous(limits = c(-10, 10))

# 2. distributions
ggplot(tibble(x = -3:3), aes(x = x)) +
  stat_function(fun = dnorm, aes(color = "normal")) +
  stat_function(fun = dunif, aes(color = "uniform")) +
  stat_function(fun = dt, aes(color = "t"), args = list(df = 2)) +
  stat_function(fun = df, aes(color = "F"), args = list(df1 = 1, df2 = 20), n = 1000) +
  stat_function(fun = dchisq, aes(color = "chisq"), args = list(df = 2), n = 1000) +
  scale_y_continuous(limits = c(0, 3))

# capm
capm_upper <- function(sd) {
  div <- 147.8/4037.6
  x <- div + sqrt(div^2 - (6 - 2339.9 * sd^2) / 4037.6)
  return(x)
}
capm_lower <- function(sd) {
  div <- 147.8/4037.6
  x <- div - sqrt(div^2 - (6 - 2339.9 * sd^2) / 4037.6)
  return(x)
}

ggplot(tibble(sd = c(0, 0.03)), aes(x = sd)) +
  stat_function(fun = capm_upper, n = 10000, aes(color = "upper")) +
  stat_function(fun = capm_lower, n = 10000, aes(color = "lower")) +
  labs(title = "The Efficient Frontier", caption = "Source: DataShenanigans",
       x = "Risk", y = "Expected Returns", color = "Efficient\nFrontier")
## Warning in sqrt(div^2 - (6 - 2339.9 * sd^2)/4037.6): NaNs produced

## Warning in sqrt(div^2 - (6 - 2339.9 * sd^2)/4037.6): NaNs produced
## Warning: Removed 5291 rows containing missing values (geom_path).

## Warning: Removed 5291 rows containing missing values (geom_path).

4. Colors

# 1. Diamond plot
set.seed(42)
diamond_df <- diamonds %>% sample_n(1000)
diamond_plot <- ggplot(diamond_df, aes(x = carat, y = price, color = cut)) +
  geom_point()

diamond_plot

diamond_plot + scale_color_brewer(palette = "Dark2")

# 2. skipped

# 3. colors are not displayed properly because we map the "drv" to fill, but specify color (usage of "scale_fill_manual" instead of "scale_color_manual" would resolve the issue)

5. Annotations

library(ggrepel)

ggplot(filter(mpg, manufacturer == "audi"), aes(x = displ, y = cty, label = model)) + 
  geom_point() + 
  geom_label()

ggplot(filter(mpg, manufacturer == "audi"), aes(x = displ, y = cty, label = model)) + 
  geom_point() + 
  geom_label_repel()

6. Themes

library(ggthemes)
diamond_plot + theme_economist() + scale_color_economist()

diamond_plot + theme_gdocs() + scale_color_gdocs()

diamond_plot + theme_solarized() + scale_color_solarized()

diamond_plot + theme_wsj() + scale_color_wsj()

7. Facets

library(nycflights13)
library(lubridate)
library(scales)

flights_df <- flights %>% 
  sample_n(10000) %>% 
  mutate(dep_time2 = ymd_hm(paste("2000-01-01", hour, minute)))
  
delay_plot <- ggplot(flights_df, aes(x = dep_time2, y = arr_delay)) +
  geom_point() + 
  geom_smooth(method = "lm") +
  scale_x_datetime(labels = date_format("%H:%M"))

delay_plot

delay_plot + facet_wrap(~origin)

# see: http://stackoverflow.com/a/12104207/3048453
facet_labeller <- function(variable, value){
  return(facet_names[value])
}
facet_names <- list(
  "EWR" = "Newark Airport",
  "JFK" = "JFK Airport",
  "LGA" = "LaGuardia Airport"
)
delay_plot + facet_wrap(~origin, labeller = facet_labeller)

8. Maps

USA-map from the workshop

library(ggmap)

state_df <- map_data("state") %>% as_data_frame()

ggplot(state_df, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "white", color = "black") +
  theme_map() + coord_map()

# include the population data
## gather the population data from library(datasets)
library(datasets)

stats_df <- tibble(
  region = tolower(state.name),
  long_center = state.center$x,
  lat_center = state.center$y,
  pop = state.x77[,"Population"],
  lifeexp = state.x77[, "Life Exp"]
)
usa_df <- left_join(state_df, stats_df, by = "region")
labels_df <- usa_df %>% select(region, long_center, lat_center, group) %>% distinct()

ggplot() + 
  geom_polygon(data = usa_df, aes(x = long, y = lat, group = group, fill = pop/1000)) +
  theme_map() + 
  coord_map() +
  labs(title = "Population of US States",
       subtitle = "As of 1970",
       fill = "Population\nin Mio") 

ggplot() + 
  geom_polygon(data = usa_df, aes(x = long, y = lat, group = group, fill = pop/1000)) +
  theme_map() + 
  coord_map() +
  labs(title = "Population of US States",
       subtitle = "As of 1970",
       fill = "Population\nin Mio") +
  geom_label_repel(data = labels_df, aes(x = long_center, y = lat_center, label = region))

Earthquake map

earthquakes <- read_csv("http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_month.csv")

world_df <-map_data("world") %>% as_data_frame()

ggplot(world_df, aes(x = long, y = lat, group = group)) +
  geom_polygon() + 
  geom_point(data = earthquakes, aes(x = longitude, y = latitude, size = mag,
                                     group = 1), color = "red", alpha = 0.1) +
  scale_size_continuous(name = "Magnitude", range = c(0,4)) +
  theme_map() +
  labs(title = "Global Earthquakes", subtitle = "Last 30 days", 
       caption = "Source: USGS as of 2017-04-16") +
  guides(size = guide_legend(override.aes = list(alpha = 1)))